load("data/02a.f_val_dist_small.Rdata")
load("data/02b.rq_coef_small.Rdata")
## Intra distances
# list processed files
processed <- list.files("data", pattern = "04\\.", full.names = TRUE)
# load data
res <- sapply(processed, function(x) mget(load(x)), simplify = TRUE)
# get summary data (1st line)
df_intra <- res[1,] %>% bind_rows()
# get distances (2nd line)
df_intra_dist <- res[2,] %>% bind_rows()
## Drop colonial Collodaria because of segmentation artifact
df_intra <- df_intra %>% filter(!str_detect(taxon, "Collodaria_colonial"))
df_intra_dist <- df_intra_dist %>% filter(!str_detect(taxon, "Collodaria_colonial"))
## Assign colour and shape to each taxon
df_intra <- df_intra %>%
mutate(
colour = as.character(paletteer_d(`"khroma::discreterainbow"`, n = 27)),
shape = rep(21:25, 6)[1:27]
)Investigate intra distances for Doliolida
Prepare data
Load distances
Process null distances
# Apply log transformation for plotting
f_val_dist <- f_val_dist %>%
mutate(
log_n_dist = log10(n_dist),
log_test_stat = log10(test_stat)
)
## Generate data to plot a ribbon between the regression lines
# limits for n_dist
lim_dist <- c(50, 2e9)
# Generate ribbon
rib_data <- tibble(n_dist = lim_dist) %>%
mutate(
# apply log-transformation
log_n_dist = log10(n_dist),
# compute estimated kuiper-stat from slope and intercept
ymin = rq_coef %>% filter(tau == 0.05) %>% pull(slope) * log_n_dist + rq_coef %>% filter(tau == 0.05) %>% pull(intercept),
ymax = rq_coef %>% filter(tau == 0.95) %>% pull(slope) * log_n_dist + rq_coef %>% filter(tau == 0.95) %>% pull(intercept)
) %>%
# reformat and reorder to plot a polygon
pivot_longer(ymin:ymax, values_to = "y") %>%
mutate(order = c(1, 2, 4, 3)) %>%
arrange(order)Process plankton distances
df_intra_dist_small <- df_intra_dist %>%
pivot_longer(dist:dist_rand) %>%
mutate(value = value * 51 / 10000) %>%
left_join(plankton_esd, by = join_by(taxon)) #%>%
# Perform Kuiper test using only small distances
df_intra_small <- mclapply(df_intra$taxon, function (ta) {
# Get distances
d1 <- df_intra_dist_small %>% filter(taxon == ta) %>% filter(name == "dist") %>% pull(value) # plankton
d2 <- df_intra_dist_small %>% filter(taxon == ta) %>% filter(name == "dist_rand") %>% pull(value) # null
# Number of distances
n_dist_small <- (length(d1) + length(d2)) / 2
# Kuiper test
kt <- kuiper_test(d1, d2)
# Return results
return(
tibble(
taxon = ta,
prop_dist_small = n_dist_small / 10000, # proportion of small distances based on the quantiles
test_stat = kt[1],
p_value = kt[2]
))
}, mc.cores = n_cores) %>%
bind_rows()
# compute number of small distances based on proportion of small distances and the total number of distances
df_intra_small <- df_intra %>%
select(taxon, n_dist, colour, shape) %>%
left_join(df_intra_small, by = join_by(taxon)) %>%
mutate(
n_dist_small = round(n_dist * prop_dist_small), # total number of small distances
log_n_dist_small = log10(n_dist_small), # log transform number of small distances
log_test_stat = log10(test_stat) # log transform test stat
) %>%
filter(n_dist_small > 100) # keep only taxons with at least 100 small distances
# Detect taxa which are above the ribbon and have the minimum number of distances required
df_intra_small <- df_intra_small %>%
mutate(
above = log_test_stat > log_n_dist_small * (rq_coef %>% filter(tau == 0.95) %>% pull(slope)) + (rq_coef %>% filter(tau == 0.95) %>% pull(intercept)), # above the polygon
keep = n_dist_small > n_dist_min, # enough distances
above = above & keep # combination of both
) %>%
select(-keep)
df_intra_small_above <- df_intra_small %>% filter(above) # needed to keep the same colour and shape across plotsResults
Distribution of distances
df_intra_dist_small <- df_intra_dist_small %>%
mutate(name = ifelse(name == "dist", "plankton", "null"))
p1a <- df_intra_dist_small %>%
filter(taxon == "Doliolida") %>%
ggplot() +
geom_density(aes(x = value, colour = name)) +
geom_vline(xintercept = c(5, 10, 20), alpha = 0.5, linewidth = 0.5) +
labs(x = "Distance (cm)", y = "Density", colour = "Type", title = "All distances") +
theme_classic()
p2a <- df_intra_dist_small %>%
filter(taxon == "Doliolida") %>%
filter(value < 20) %>%
ggplot() +
geom_density(aes(x = value, colour = name)) +
labs(x = "Distance (cm)", y = "Density", colour = "Type", title = "Below 20") +
theme_classic()
p3a <- df_intra_dist_small %>%
filter(taxon == "Doliolida") %>%
filter(value < 10) %>%
ggplot() +
geom_density(aes(x = value, colour = name)) +
labs(x = "Distance (cm)", y = "Density", colour = "Type", title = "Below 10") +
theme_classic()
p4a <- df_intra_dist_small %>%
filter(taxon == "Doliolida") %>%
filter(value < 5) %>%
ggplot() +
geom_density(aes(x = value, colour = name)) +
labs(x = "Distance (cm)", y = "Density", colour = "Type", title = "Below 5") +
theme_classic()
p1a + p2a + p3a + p4aThere is this weird drop at the right of the distribution when filtering distances.
What happens if we clip the x axis instead of filtering distances? Let’s try it for 20 cm.
df_intra_dist_small %>%
filter(taxon == "Doliolida") %>%
ggplot() +
geom_density(aes(x = value, colour = name)) +
geom_vline(xintercept = c(5, 10, 20), alpha = 0.5, linewidth = 0.5) +
scale_x_continuous(limits = c(0, 20)) +
labs(x = "Distance (cm)", y = "Density", colour = "Type", title = "Clip 20") +
theme_classic()Same as filtering distances.
ECDFs
Let’s plot corresponding ECDFs.
p1b <- df_intra_dist_small %>%
filter(taxon == "Doliolida") %>%
ggplot() +
stat_ecdf(aes(x = value, colour = name)) +
geom_vline(xintercept = c(5, 10, 20), alpha = 0.5, linewidth = 0.5) +
labs(x = "Distance (cm)", y = "ECDF", colour = "Type", title = "All distances") +
theme_classic()
p2b <- df_intra_dist_small %>%
filter(taxon == "Doliolida") %>%
filter(value < 20) %>%
ggplot() +
stat_ecdf(aes(x = value, colour = name)) +
labs(x = "Distance (cm)", y = "ECDF", colour = "Type", title = "Below 20") +
theme_classic()
p3b <- df_intra_dist_small %>%
filter(taxon == "Doliolida") %>%
filter(value < 10) %>%
ggplot() +
stat_ecdf(aes(x = value, colour = name)) +
labs(x = "Distance (cm)", y = "ECDF", colour = "Type", title = "Below 10") +
theme_classic()
p4b <- df_intra_dist_small %>%
filter(taxon == "Doliolida") %>%
filter(value < 5) %>%
ggplot() +
stat_ecdf(aes(x = value, colour = name)) +
labs(x = "Distance (cm)", y = "ECDF", colour = "Type", title = "Below 5") +
theme_classic()
p1b + p2b + p3b + p4bLet’s plot ECDFs alongside with distributions.
(p2a + geom_vline(xintercept = 7.35, alpha = 0.5, linewidth = 0.5, colour = "grey")) +
(p3a + geom_vline(xintercept = 6.15, alpha = 0.5, linewidth = 0.5, colour = "grey")) +
p2b +
p3bHere we notice that the point were distributions meet vary in x depending on distance filtering. Is this expected?
Now we want to have a look at the differences between ECDFs to establish whether organisms are closer or further than expected. Thus, we need to compute the ECDFs, and not only plot them.
Compute ECDFs
ECDFs are computed on a x axis vector from 0 to max(dist) with a length of 1000.
Filtering after
Let’s start by computing the ECDF using all distances and then filter to restrict it to smaller distances.
# Reshape df to have one row per taxon
df_wide <- df_intra_dist_small %>%
filter(taxon == "Doliolida") %>%
pivot_wider(names_from = "name", values_from = "value", values_fn = list)
# Compute ecdf
# Needs a nested call: ecdf(x) returns a function, x being the values of interest
# Call this function with argument x2 to get values along x2
x_axis_seq <- seq(0, 70, length.out = 1000)
plank_ecdf <- ecdf(unlist(df_wide$plankton))(x_axis_seq)
null_ecdf <- ecdf(unlist(df_wide$null))(x_axis_seq)
# Store ECDFs and their difference
ecdfs <- tibble(
x_axis_seq = x_axis_seq, # we need this as the x axis
plank_ecdf = plank_ecdf,
null_ecdf = null_ecdf,
diff = plank_ecdf - null_ecdf
) %>%
mutate(rank = row_number()) # not necessarly needed
# Check that generated ECDFs are the same as the plotted ones
p1c <- ecdfs %>%
select(x_axis_seq, plank_ecdf, null_ecdf) %>%
pivot_longer(plank_ecdf:null_ecdf) %>%
ggplot() +
geom_path(aes(x = x_axis_seq, y = value, colour = name)) +
labs(x = "Distance (cm)", y = "ECDF", colour = "Type", title = "All distances") +
theme_classic()
p2c <- ecdfs %>%
select(x_axis_seq, plank_ecdf, null_ecdf) %>%
pivot_longer(plank_ecdf:null_ecdf) %>%
filter(x_axis_seq < 20) %>%
ggplot() +
geom_path(aes(x = x_axis_seq, y = value, colour = name)) +
labs(x = "Distance (cm)", y = "ECDF", colour = "Type", title = "Below 20") +
theme_classic()
p3c <- ecdfs %>%
select(x_axis_seq, plank_ecdf, null_ecdf) %>%
pivot_longer(plank_ecdf:null_ecdf) %>%
filter(x_axis_seq < 10) %>%
ggplot() +
geom_path(aes(x = x_axis_seq, y = value, colour = name)) +
labs(x = "Distance (cm)", y = "ECDF", colour = "Type", title = "Below 10") +
theme_classic()
p4c <- ecdfs %>%
select(x_axis_seq, plank_ecdf, null_ecdf) %>%
pivot_longer(plank_ecdf:null_ecdf) %>%
filter(x_axis_seq < 5) %>%
ggplot() +
geom_path(aes(x = x_axis_seq, y = value, colour = name)) +
labs(x = "Distance (cm)", y = "ECDF", colour = "Type", title = "Below 5") +
theme_classic()
p1c + p2c + p3c + p4cLet’s compare with our previous ECDFs.
Our computed ECDFs were artificially cut so they do not go up to 1 and are shifted. Rescaling them would probably fix this issue.
Computing ECDFs from all distances and then filtering them later on distances does not seem appropriate. Let’s try computing ECDFs from filtered distances.
Filtering before
Second version: we first filter distances and them compute ECDFs on filtered distances.
## Generate ECDFs for all distance thresholds
dist_thres <- tibble(
dist_thr_cm = c(70, 20, 10, 5),
name = c("All distances", "Below 20", "Below 10", "Below 5")
) %>%
mutate(name = factor(name, levels = c("All distances", "Below 20", "Below 10", "Below 5")))
# Loop over thresholds
ecdfs <- lapply(1:nrow(dist_thres), function(i) {
# Get distance threshold and plot name
r <- dist_thres %>% slice(i)
# Generate the wide dataset and apply threshold
df_wide_r <- df_intra_dist_small %>%
filter(taxon == "Doliolida") %>%
filter(value < r$dist_thr_cm) %>%
pivot_wider(names_from = "name", values_from = "value", values_fn = list)
# Compute ECDF on a common x axis sequence
x_axis_seq_r <- seq(0, r$dist_thr_cm, length.out = 1000)
plank_ecdf_r <- ecdf(unlist(df_wide_r$plankton))(x_axis_seq_r)
null_ecdf_r <- ecdf(unlist(df_wide_r$null))(x_axis_seq_r)
# Store results
res <- tibble(
x_axis_seq = x_axis_seq_r,
plank_ecdf = plank_ecdf_r,
null_ecdf = null_ecdf_r
) %>%
mutate(
diff = plank_ecdf - null_ecdf,
dist_thr_cm = r$dist_thr_cm,
name = r$name
)
return(res)
}) %>%
bind_rows()
ecdfs %>%
select(-diff) %>%
pivot_longer(plank_ecdf:null_ecdf, names_to = "type", values_to = "value") %>%
mutate(type == ifelse(type == "plank_ecdf", "plankton", "null")) %>%
ggplot() +
geom_path(aes(x = x_axis_seq, y = value, colour = type)) +
labs(x = "Distance (cm)", y = "ECDF", colour = "Type") +
facet_wrap(~name, scales = "free") +
theme_classic() +
theme(strip.background = element_rect(colour = "white"))Looks quite similar as the original ECDFs, let’s keep these ones. Now let’s have a look at differences between ECDFs.
ECDFs differences
ggplot(ecdfs) +
geom_path(aes(x = x_axis_seq, y = diff), colour = "#00BFC4") +
geom_hline(yintercept = 0, colour = "#F8766D") +
labs(x = "Distance (cm)", y = "Plankton ECDF - Null ECDF") +
facet_wrap(~name, scales = "free") +
theme_classic() +
theme(strip.background = element_rect(colour = "white"))Unexpectedly, the direction of the interaction varies depending on the threshold we used on distances. If we look closer, we notice conserved patterns in plankton ECDFs (blue) across distances, but it does not cross 0 for the same distance, as if the null ECDF (red) was shifted above or below.
Doliolid updated quantiles
Read
load("data/test/04a.doliolida_5_cm.Rdata")
res_05 <- res
res_dist_05 <- res_dist
load("data/test/04a.doliolida_10_cm.Rdata")
res_10 <- res
res_dist_10 <- res_dist
load("data/test/04a.doliolida_20_cm.Rdata")
res_20 <- res
res_dist_20 <- res_dist
# Store results together
res_filt <- res_05 %>% mutate(dist_thr = 5) %>%
bind_rows(res_10 %>% mutate(dist_thr = 10)) %>%
bind_rows(res_20 %>% mutate(dist_thr = 20))
res_filt_dist <- res_dist_05 %>% mutate(dist_thr = 5) %>%
bind_rows(res_dist_10 %>% mutate(dist_thr = 10)) %>%
bind_rows(res_dist_20 %>% mutate(dist_thr = 20)) %>%
rename(plankton = dist, null = dist_rand) %>%
# convert pix to cm
mutate(
plankton = plankton * 51 / 10000,
null = null * 51 / 10000
)Distributions
ECDF
dist_thres <- c(5, 10, 20)
# Loop over thresholds
ecdfs_filt <- lapply(dist_thres, function(t) {
# Get distance threshold and plot name
t_dist <- res_filt_dist %>% filter(dist_thr == t)
# Compute ECDF on a common x axis sequence
x_axis_seq_r <- seq(0, t, length.out = 1000)
plank_ecdf_r <- ecdf(unlist(t_dist$plankton))(x_axis_seq_r)
null_ecdf_r <- ecdf(unlist(t_dist$null))(x_axis_seq_r)
# Store results
res <- tibble(
x_axis_seq = x_axis_seq_r,
plank_ecdf = plank_ecdf_r,
null_ecdf = null_ecdf_r
) %>%
mutate(
diff = plank_ecdf - null_ecdf,
dist_thr_cm = t
)
return(res)
}) %>%
bind_rows()
ecdfs_filt %>%
select(-diff) %>%
pivot_longer(plank_ecdf:null_ecdf, names_to = "type", values_to = "value") %>%
mutate(type == ifelse(type == "plank_ecdf", "plankton", "null")) %>%
ggplot() +
geom_path(aes(x = x_axis_seq, y = value, colour = type)) +
labs(x = "Distance (cm)", y = "ECDF", colour = "Type") +
facet_wrap(~dist_thr_cm, scales = "free", ncol = 2) +
theme_classic() +
theme(strip.background = element_rect(colour = "white"))ECDFs differences
ggplot(ecdfs_filt) +
geom_path(aes(x = x_axis_seq, y = diff), colour = "#00BFC4") +
geom_hline(yintercept = 0, colour = "#F8766D") +
labs(x = "Distance (cm)", y = "Plankton ECDF - Null ECDF") +
facet_wrap(~dist_thr_cm, scales = "free", ncol = 2) +
theme_classic() +
theme(strip.background = element_rect(colour = "white"))Compare full quantiles and <20 quantiles
# Store together ECDFs from all quantiles and from <20 quantiles
ecdfs_qt <- ecdfs %>%
select(-name) %>%
mutate(quantile = "full") %>%
bind_rows(ecdfs_filt %>% mutate(quantile = "< 20")) %>%
filter(dist_thr_cm < 70) # drop the case with all distances
ggplot(ecdfs_qt) +
geom_path(aes(x = x_axis_seq, y = diff, colour = quantile), alpha = 0.5) +
geom_hline(yintercept = 0, colour = "red") +
facet_wrap(~dist_thr_cm, scales = "free", ncol = 2) +
labs(
x = "Distance (cm)", y = "Plankton ECDF - Null ECDF", colour = "Quantiles",
title = "ECDFs dist for various distance threshold and quantile threshold"
)As expected, full quantiles are noisier. Let’s try some smoothing with a moving average.
ecdfs_qt %>%
group_by(dist_thr_cm, quantile) %>%
mutate(mmean = slide(diff, k = 20, fun = weighted.mean, na.rm = TRUE, n = 3, w = cweights(20))) %>%
ungroup() %>%
ggplot() +
geom_path(aes(x = x_axis_seq, y = diff, colour = quantile), alpha = 0.2) +
geom_path(aes(x = x_axis_seq, y = mmean, colour = quantile)) +
geom_hline(yintercept = 0, colour = "red") +
facet_wrap(~dist_thr_cm, scales = "free", ncol = 2) +
labs(
x = "Distance (cm)", y = "Plankton ECDF - Null ECDF", colour = "Quantiles",
title = "ECDFs dist for various distance threshold and quantile threshold"
)All organisms: full quantiles and various distance thresholds
Read distances
## Intra distances
# list processed files
processed <- list.files("data", pattern = "04\\.", full.names = TRUE)
# load data
res <- sapply(processed, function(x) mget(load(x)), simplify = TRUE)
# get summary data (1st line)
df_intra <- res[1,] %>% bind_rows()
# get distances (2nd line)
df_intra_dist <- res[2,] %>%
bind_rows() %>%
# convert from px to cm
mutate(
dist = dist * 51 / 10000,
dist_rand = dist_rand * 51 / 10000
)
## Drop colonial Collodaria because of segmentation artifact
df_intra <- df_intra %>% filter(!str_detect(taxon, "Collodaria_colonial"))
df_intra_dist <- df_intra_dist %>% filter(!str_detect(taxon, "Collodaria_colonial"))
## Assign colour and shape to each taxon
df_intra <- df_intra %>%
mutate(
colour = as.character(paletteer_d(`"khroma::discreterainbow"`, n = 27)),
shape = rep(21:25, 6)[1:27]
)
# List taxonomic groups
taxa <- unique(df_intra_dist$taxon)Filter distances and compute ECDFs
# Loop over taxonomic groups
ecdf_dist_all <- lapply(taxa, function(ta) {
# Get distances for this taxon
df_ta <- df_intra_dist %>% filter(taxon == ta)
# Loop over distance thresholds
lapply(dist_thres, function(d) {
# Keep only distances below threshold
df_ta_d <- df_ta %>%
pivot_longer(dist:dist_rand) %>%
filter(value < d)
# Compute ECDF
x_axis_seq <- seq(0, d, length.out = 1000)
plank_dist <- df_ta_d %>% filter(name == "dist") %>% pull(value)
null_dist <- df_ta_d %>% filter(name == "dist_rand") %>% pull(value)
plank_ecdf <- ecdf(plank_dist)(x_axis_seq)
null_ecdf <- ecdf(null_dist)(x_axis_seq)
# Store results
res <- tibble(
taxon = rep(ta, length.out = length(x_axis_seq)),
dist_thr = rep(d, length.out = length(x_axis_seq)) %>% as.factor(),
x_axis_seq = x_axis_seq,
plank_ecdf = plank_ecdf,
null_ecdf = null_ecdf
) %>%
mutate(
# compute difference
diff = plank_ecdf - null_ecdf,
# smooth it
diff_sm = slide(diff, k = 20, fun = weighted.mean, na.rm = TRUE, n = 3, w = cweights(20)),
# compute sign of deviation
dir = sign(diff_sm) %>% as.factor()
)
return(res)
}) %>%
bind_rows()
}) %>%
bind_rows()
# Generate a summary for each distance threshold
# - prop closer in each dir
ecdf_dist_all_summ <- ecdf_dist_all %>%
group_by(taxon, dist_thr) %>%
summarise(
uni_d = length(unique(dir)) == 1,
prop_closer = sum(dir == 1) / n(),
sum_ecdf = sum(diff_sm),
.groups = "drop"
)Plot differences in ECDFs for all thresholds
Proportion of organisms that are closer
ggplot(ecdf_dist_all_summ) +
geom_raster(aes(x = taxon, y = dist_thr, fill = prop_closer)) +
geom_point(data = ecdf_dist_all_summ %>% filter(uni_d), aes(x = taxon, y = dist_thr)) +
scale_fill_gradient2(
low = "#d7191c", high = "#2c7bb6", mid = "#ffffbf",
midpoint = 0.5, limits = c(0, 1)
) +
coord_flip()Sum of ECDF differences
Distribution of ECDF differences
ecdf_dist_all %>%
ggplot() +
geom_path(aes(x = x_axis_seq, y = diff_sm)) +
geom_area(aes(x = x_axis_seq, y = ifelse(diff_sm < 0, diff_sm, 0)), fill = "#d7191c") +
geom_area(aes(x = x_axis_seq, y = ifelse(diff_sm > 0, diff_sm, 0)), fill = "#2c7bb6") +
geom_hline(yintercept = 0, colour = "grey") +
labs(x = "Distance (cm)", y = "Plankton ECDF - Null ECDF") +
facet_grid(taxon~dist_thr, scales = "free", switch = 'y') +
theme(
axis.text.y = element_blank(),
strip.text.y.left = element_text(angle = 0)
)Try identifying patterns in ECDFs differences for the 10 cm threshold
For this we used non smoothed differences.
ecdf_10 <- ecdf_dist_all %>% filter(dist_thr == 10)#%>% filter(taxon %in% taxa[1:5])
ecdf_10 %>%
ggplot() +
geom_path(aes(x = x_axis_seq, y = diff)) +
#geom_area(aes(x = x_axis_seq, y = ifelse(diff < 0, diff, 0)), fill = "#d7191c") +
#geom_area(aes(x = x_axis_seq, y = ifelse(diff > 0, diff, 0)), fill = "#2c7bb6") +
geom_hline(yintercept = 0, colour = "grey") +
labs(x = "Distance (cm)", y = "Plankton ECDF - Null ECDF") +
facet_grid(taxon~dist_thr, scales = "free", switch = 'y') +
theme(
axis.text.y = element_blank(),
strip.text.y.left = element_text(angle = 0)
)ecdf_10 %>%
group_by(taxon, dist_thr) %>%
mutate(
mmed = slide(diff, k = 50, fun = median, na.rm = TRUE, n = 5),
) %>%
ungroup() %>%
ggplot() +
geom_path(aes(x = x_axis_seq, y = mmed)) +
#geom_area(aes(x = x_axis_seq, y = ifelse(mmed < 0, mmed, 0)), fill = "#d7191c") +
#geom_area(aes(x = x_axis_seq, y = ifelse(mmed > 0, mmed, 0)), fill = "#2c7bb6") +
geom_hline(yintercept = 0, colour = "grey") +
labs(x = "Distance (cm)", y = "Plankton ECDF - Null ECDF") +
facet_wrap(~taxon, ncol = 3, scales = "free") + #strip.position
theme(
axis.text.y = element_blank(),
strip.text.y.left = element_text(angle = 0)
)Try identifying a distance threshold
Doliolids
Coarse
Start with coarse thresholds: 10, 20, 30, 40, 50 and 60.
# Define list of thresholds to try
thresholds_c <- seq(5, 60, by = 5)
# Loop over thresholds, filter distances and perform Kuiper test
dol_thres_c <- lapply(thresholds_c, function(thres) {
# Filter distances
thres_dist <- df_intra_dist %>%
filter(taxon == "Doliolida") %>%
pivot_longer(dist:dist_rand) %>%
filter(value < thres)
# Extract distances
d1 <- thres_dist %>% filter(name == "dist") %>% pull(value) # plankton
d2 <- thres_dist %>% filter(name == "dist_rand") %>% pull(value) # null
# Perform Kuiper test
kt <- kuiper_test(d1, d2)
# Return results
tibble(
taxon = "Doliolida",
dist_thres = thres,
n_qt_p = length(d1),
n_qt_r = length(d2),
kuiper_stat = kt[1]
)
}) %>%
bind_rows()
# Join with total number of distances
dol_thres_c <- dol_thres_c %>%
left_join(df_intra %>% select(taxon, n_dist), by = join_by(taxon)) %>%
mutate(
# compute number of distances from total distances and proportion of quantiles
n_dist_p = round(n_dist * (n_qt_p / 10000)),
n_dist_r = round(n_dist * (n_qt_r / 10000))
)
# Plot results
ggplot(dol_thres_c) +
geom_path(aes(x = dist_thres, y = kuiper_stat)) +
labs(x = "Distance threshold (cm)", y = "Kuiper statistic") +
theme_classic()ggplot(dol_thres_c) +
geom_path(aes(x = n_dist_p, y = kuiper_stat)) +
labs(x = "Number of distances", y = "Kuiper statistic") +
theme_classic()Fine
Let’s focus on distances < 20 cm and try to refine the threshold.
# Define list of thresholds to try
thresholds_f <- seq(4, 20, by = 2)
# Loop over thresholds, filter distances and perform Kuiper test
dol_thres_f <- lapply(thresholds_f, function(thres) {
# Filter distances
thres_dist <- df_intra_dist %>%
filter(taxon == "Doliolida") %>%
pivot_longer(dist:dist_rand) %>%
filter(value < thres)
# Extract distances
d1 <- thres_dist %>% filter(name == "dist") %>% pull(value) # plankton
d2 <- thres_dist %>% filter(name == "dist_rand") %>% pull(value) # null
# Perform Kuiper test
kt <- kuiper_test(d1, d2)
# Return results
tibble(
taxon = "Doliolida",
dist_thres = thres,
n_qt_p = length(d1),
n_qt_r = length(d2),
kuiper_stat = kt[1]
)
}) %>%
bind_rows()
# Join with total number of distances
dol_thres_f <- dol_thres_f %>%
left_join(df_intra %>% select(taxon, n_dist), by = join_by(taxon)) %>%
mutate(
# compute number of distances from total distances and proportion of quantiles
n_dist_p = round(n_dist * (n_qt_p / 10000)),
n_dist_r = round(n_dist * (n_qt_r / 10000))
)
# Plot results
ggplot(dol_thres_f) +
geom_path(aes(x = dist_thres, y = kuiper_stat)) +
labs(x = "Distance threshold (cm)", y = "Kuiper statistic") +
theme_classic()ggplot(dol_thres_f) +
geom_path(aes(x = n_dist_p, y = kuiper_stat)) +
labs(x = "Number of distances", y = "Kuiper statistic") +
theme_classic()There is a threshold (12 cm) for which there is a maximum of information.
Let’s try this for all plankton groups, starting with the coarse thresholding.
All taxa
Coarse
# Define list of thresholds to try
thresholds_c <- seq(5, 60, by = 5)
# Generate a combination of taxa by thresholds to try
comb <- crossing(taxon = taxa, thres = thresholds_c)
# Loop over combination of thresholds and taxa
taxa_thres_c <- mclapply(1:nrow(comb), function(i) {
# Get row of interest
r <- comb %>% slice(i)
# Threshold distances
thres_dist <- df_intra_dist %>%
filter(taxon == r$taxon) %>%
pivot_longer(dist:dist_rand) %>%
filter(value < r$thres)
# Extract distances
d1 <- thres_dist %>% filter(name == "dist") %>% pull(value) # plankton
d2 <- thres_dist %>% filter(name == "dist_rand") %>% pull(value) # null
# Perform Kuiper test
kt <- kuiper_test(d1, d2)
# Return results
tibble(
taxon = r$taxon,
dist_thres = r$thres,
n_qt_p = length(d1),
n_qt_r = length(d2),
kuiper_stat = kt[1]
)
}, mc.cores = n_cores) %>%
bind_rows()
# Join with total number of distances
taxa_thres_c <- taxa_thres_c %>%
left_join(df_intra %>% select(taxon, n_dist), by = join_by(taxon)) %>%
mutate(
# compute number of distances from total distances and proportion of quantiles
n_dist_p = round(n_dist * (n_qt_p / 10000)),
n_dist_r = round(n_dist * (n_qt_r / 10000))
)
# Plot results
ggplot(taxa_thres_c) +
geom_path(aes(x = dist_thres, y = kuiper_stat)) +
labs(x = "Distance threshold (cm)", y = "Kuiper statistic") +
facet_wrap(~taxon, scales = "free") +
theme_classic() +
theme(strip.background = element_rect(colour = "white"))ggplot(taxa_thres_c) +
geom_path(aes(x = n_dist_p, y = kuiper_stat)) +
labs(x = "Number of distances", y = "Kuiper statistic") +
facet_wrap(~taxon, scales = "free") +
theme_classic() +
theme(strip.background = element_rect(colour = "white"))For many taxonomic groups we can identify a maximum of information around 5-10 cm. Let’s try refining that threshold between 4 and 20 cm.
Fine
# Define list of thresholds to try
thresholds_f <- seq(4, 20, by = 2)
# Generate a combination of taxa by thresholds to try
comb <- crossing(taxon = taxa, thres = thresholds_f)
# Loop over combination of thresholds and taxa
taxa_thres_f <- mclapply(1:nrow(comb), function(i) {
# Get row of interest
r <- comb %>% slice(i)
# Threshold distances
thres_dist <- df_intra_dist %>%
filter(taxon == r$taxon) %>%
pivot_longer(dist:dist_rand) %>%
filter(value < r$thres)
# Extract distances
d1 <- thres_dist %>% filter(name == "dist") %>% pull(value) # plankton
d2 <- thres_dist %>% filter(name == "dist_rand") %>% pull(value) # null
# Perform Kuiper test
kt <- kuiper_test(d1, d2)
# Return results
tibble(
taxon = r$taxon,
dist_thres = r$thres,
n_qt_p = length(d1),
n_qt_r = length(d2),
kuiper_stat = kt[1]
)
}, mc.cores = n_cores) %>%
bind_rows()
# Join with total number of distances
taxa_thres_f <- taxa_thres_f %>%
left_join(df_intra %>% select(taxon, n_dist), by = join_by(taxon)) %>%
mutate(
# compute number of distances from total distances and proportion of quantiles
n_dist_p = round(n_dist * (n_qt_p / 10000)),
n_dist_r = round(n_dist * (n_qt_r / 10000))
)
# Plot results
ggplot(taxa_thres_f) +
geom_path(aes(x = dist_thres, y = kuiper_stat)) +
labs(x = "Distance threshold (cm)", y = "Kuiper statistic") +
facet_wrap(~taxon, scales = "free") +
theme_classic() +
theme(strip.background = element_rect(colour = "white"))ggplot(taxa_thres_f) +
geom_path(aes(x = n_dist_p, y = kuiper_stat)) +
labs(x = "Number of distances", y = "Kuiper statistic") +
facet_wrap(~taxon, scales = "free") +
theme_classic() +
theme(strip.background = element_rect(colour = "white"))The result is not satisfying for all groups. Let’s try the same approach on overall distances.
All distances
First we need to load this data.
load("data/03.all_distances.Rdata")
# Extract distances
df_all_dist <- df_all %>%
select(dist, dist_rand) %>%
unnest(c(dist, dist_rand))
df_all <- df_all %>% select(-c(dist, dist_rand))Coarse
# Define list of thresholds to try
thresholds_c <- seq(5, 60, by = 5)
# Loop over thresholds, filter distances and perform Kuiper test
all_thres_c <- lapply(thresholds_c, function(thres) {
# Filter distances
thres_dist <- df_all_dist %>%
pivot_longer(dist:dist_rand) %>%
mutate(value = value * 51 / 10000) %>%
filter(value < thres)
# Extract distances
d1 <- thres_dist %>% filter(name == "dist") %>% pull(value) # plankton
d2 <- thres_dist %>% filter(name == "dist_rand") %>% pull(value) # null
# Perform Kuiper test
kt <- kuiper_test(d1, d2)
# Return results
tibble(
taxon = "all",
dist_thres = thres,
n_qt_p = length(d1),
n_qt_r = length(d2),
kuiper_stat = kt[1]
)
}) %>%
bind_rows()
# Join with total number of distances
all_thres_c <- all_thres_c %>%
left_join(df_all %>% select(taxon, n_dist), by = join_by(taxon)) %>%
mutate(
# compute number of distances from total distances and proportion of quantiles
n_dist_p = round(n_dist * (n_qt_p / 10000)),
n_dist_r = round(n_dist * (n_qt_r / 10000))
)
# Plot results
ggplot(all_thres_c) +
geom_path(aes(x = dist_thres, y = kuiper_stat)) +
labs(x = "Distance threshold (cm)", y = "Kuiper statistic") +
theme_classic()ggplot(all_thres_c) +
geom_path(aes(x = n_dist_p, y = kuiper_stat)) +
labs(x = "Number of distances", y = "Kuiper statistic") +
theme_classic()Great, we have a maximum of information for a given threshold!
Let’s try refining it!
Fine
# Define list of thresholds to try
thresholds_f <- seq(4, 20, by = 2)
# Loop over thresholds, filter distances and perform Kuiper test
all_thres_f <- lapply(thresholds_f, function(thres) {
# Filter distances
thres_dist <- df_all_dist %>%
pivot_longer(dist:dist_rand) %>%
mutate(value = value * 51 / 10000) %>%
filter(value < thres)
# Extract distances
d1 <- thres_dist %>% filter(name == "dist") %>% pull(value) # plankton
d2 <- thres_dist %>% filter(name == "dist_rand") %>% pull(value) # null
# Perform Kuiper test
kt <- kuiper_test(d1, d2)
# Return results
tibble(
taxon = "all",
dist_thres = thres,
n_qt_p = length(d1),
n_qt_r = length(d2),
kuiper_stat = kt[1]
)
}) %>%
bind_rows()
# Join with total number of distances
all_thres_f <- all_thres_f %>%
left_join(df_all %>% select(taxon, n_dist), by = join_by(taxon)) %>%
mutate(
# compute number of distances from total distances and proportion of quantiles
n_dist_p = round(n_dist * (n_qt_p / 10000)),
n_dist_r = round(n_dist * (n_qt_r / 10000))
)
# Extract the max of information
max_info <- all_thres_f %>% arrange(desc(kuiper_stat)) %>% slice_head(n = 1)
max_info# A tibble: 1 × 8
taxon dist_thres n_qt_p n_qt_r kuiper_stat n_dist n_dist_p n_dist_r
<chr> <dbl> <int> <int> <dbl> <int> <dbl> <dbl>
1 all 12 3038 3001 0.0168 483179289 146789868 145002105
# Plot results
ggplot(all_thres_f) +
geom_path(aes(x = dist_thres, y = kuiper_stat)) +
geom_vline(xintercept = max_info$dist_thres, colour = "red") +
labs(x = "Distance threshold (cm)", y = "Kuiper statistic") +
theme_classic()ggplot(all_thres_f) +
geom_path(aes(x = n_dist_p, y = kuiper_stat)) +
geom_vline(xintercept = max_info$n_dist_p, colour = "red") +
labs(x = "Number of distances", y = "Kuiper statistic") +
theme_classic()Try refining even further.
Very fine
# Define list of thresholds to try
thresholds_vf <- seq(9, 15, by = 0.5)
# Loop over thresholds, filter distances and perform Kuiper test
all_thres_vf <- lapply(thresholds_vf, function(thres) {
# Filter distances
thres_dist <- df_all_dist %>%
pivot_longer(dist:dist_rand) %>%
mutate(value = value * 51 / 10000) %>%
filter(value < thres)
# Extract distances
d1 <- thres_dist %>% filter(name == "dist") %>% pull(value) # plankton
d2 <- thres_dist %>% filter(name == "dist_rand") %>% pull(value) # null
# Perform Kuiper test
kt <- kuiper_test(d1, d2)
# Return results
tibble(
taxon = "all",
dist_thres = thres,
n_qt_p = length(d1),
n_qt_r = length(d2),
kuiper_stat = kt[1]
)
}) %>%
bind_rows()
# Join with total number of distances
all_thres_vf <- all_thres_vf %>%
left_join(df_all %>% select(taxon, n_dist), by = join_by(taxon)) %>%
mutate(
# compute number of distances from total distances and proportion of quantiles
n_dist_p = round(n_dist * (n_qt_p / 10000)),
n_dist_r = round(n_dist * (n_qt_r / 10000))
)
# Extract the max of information
max_info_vf <- all_thres_vf %>% arrange(desc(kuiper_stat)) %>% slice_head(n = 1)
max_info_vf# A tibble: 1 × 8
taxon dist_thres n_qt_p n_qt_r kuiper_stat n_dist n_dist_p n_dist_r
<chr> <dbl> <int> <int> <dbl> <int> <dbl> <dbl>
1 all 11 2770 2729 0.0170 483179289 133840663 131859628
# Plot results
ggplot(all_thres_vf) +
geom_path(aes(x = dist_thres, y = kuiper_stat)) +
geom_vline(xintercept = max_info_vf$dist_thres, colour = "red") +
labs(x = "Distance threshold (cm)", y = "Kuiper statistic") +
theme_classic()ggplot(all_thres_vf) +
geom_path(aes(x = n_dist_p, y = kuiper_stat)) +
geom_vline(xintercept = max_info_vf$n_dist_p, colour = "red") +
labs(x = "Number of distances", y = "Kuiper statistic") +
theme_classic()Let’s focus on distances < 11 cm.
Let’s check this threshold and number of distances for all taxa.
# Get the identified threshold for each taxonomic group
taxa_thres_f_max <- taxa_thres_f %>%
group_by(taxon) %>%
filter(kuiper_stat == max(kuiper_stat)) %>%
ungroup()
ggplot(taxa_thres_f_max) +
geom_point(aes(x = n_dist_p, y = dist_thres)) +
scale_x_log10() +
labs(x = "Number of distances", y = "Distance threshold (cm)")ggplot(taxa_thres_f_max) +
geom_point(aes(x = n_dist_p, y = kuiper_stat, colour = dist_thres)) +
scale_colour_viridis_c() +
scale_x_log10() + scale_y_log10() +
labs(x = "Number of distances", y = "Kuiper statistic", colour = "Distance threshold (cm)")A distance threshold can be identified only if enough distances are present. Recomputing distances and saving quantiles < 20 cm could help identify more thresholds.
What about the blue point at 10⁵ distances?
taxa_thres_f_max %>% filter(n_dist_p > 10000 & dist_thres == 4)# A tibble: 1 × 8
taxon dist_thres n_qt_p n_qt_r kuiper_stat n_dist n_dist_p n_dist_r
<chr> <dbl> <int> <int> <dbl> <int> <dbl> <dbl>
1 Rhizaria 4 590 579 0.0167 1520322 89699 88027
# It’s Rhizaria
taxa_thres_f %>%
filter(taxon == "Rhizaria") %>%
ggplot() +
geom_path(aes(x = dist_thres, y = kuiper_stat)) +
#geom_vline(xintercept = max_info$dist_thres, colour = "red") +
labs(x = "Distance threshold (cm)", y = "Kuiper statistic") +
theme_classic()Now that we have a distance threshold, apply it to all taxonomic groups and try to identify patterns in ECDF differences.
Apply the 11 cm threshold
We might need to recompute distances to store quantiles below a given threshold, probably a little bit higher than 11 cm.
Process plankton distances
dist_thr <- 11
# Loop over taxonomic groups, perform Kuiper test, compute ECDF
df_intra_11 <- lapply(taxa, function(ta) {
# Get distances for this taxon
df_ta <- df_intra_dist %>%
filter(taxon == ta) %>%
# Keep only distances below threshold
pivot_longer(dist:dist_rand) %>%
filter(value < dist_thr)
plank_dist <- df_ta %>% filter(name == "dist") %>% pull(value)
null_dist <- df_ta %>% filter(name == "dist_rand") %>% pull(value)
n_dist <- (length(plank_dist) + length(null_dist))/2
# Kuiper test
kt <- kuiper_test(plank_dist, null_dist)
# Compute ECDF
x_axis_seq <- seq(0, dist_thr, length.out = 1000)
plank_ecdf <- ecdf(plank_dist)(x_axis_seq)
null_ecdf <- ecdf(null_dist)(x_axis_seq)
# Store results
res <- tibble(
taxon = ta,
dist_thr = dist_thr,
n_dist_small_qt = n_dist,
test_stat = kt[1],
p_value = kt[2],
x_axis_seq = list(x_axis_seq),
plank_ecdf = list(plank_ecdf),
null_ecdf = list(null_ecdf)
)
# Return result
return(res)
}) %>%
bind_rows()
# Extract ECDFs, compute difference and smooth it
df_intra_ecdf_11 <- df_intra_11 %>%
select(taxon, x_axis_seq, plank_ecdf, null_ecdf) %>%
unnest(c(x_axis_seq, plank_ecdf, null_ecdf)) %>%
group_by(taxon) %>%
mutate(
diff = plank_ecdf - null_ecdf,
diff_sm = slide(diff, k = 20, fun = weighted.mean, na.rm = TRUE, n = 3, w = cweights(20))
) %>%
ungroup()
# Get total number of distances below threshold
df_intra_11 <- df_intra_11 %>%
select(-c(x_axis_seq, plank_ecdf, null_ecdf)) %>%
left_join(df_intra %>% select(taxon, n_dist_tot = n_dist, colour, shape), by = join_by(taxon)) %>%
mutate(
n_dist_small = (n_dist_small_qt / 10000) * n_dist_tot,
log_n_dist_small = log10(n_dist_small),
log_test_stat = log10(test_stat)
) %>%
filter(n_dist_small > 100)
# Detect taxa which are above the ribbon and have the minimum number of distances required
df_intra_11 <- df_intra_11 %>%
mutate(
above = log_test_stat > log_n_dist_small * (rq_coef %>% filter(tau == 0.95) %>% pull(slope)) + (rq_coef %>% filter(tau == 0.95) %>% pull(intercept)), # above the polygon
keep = n_dist_small > n_dist_min, # enough distances
above = above & keep # combination of both
) %>%
select(-keep)
df_intra_11_above <- df_intra_11 %>% filter(above) # needed to keep the same colour and shape across plotsPlot Kuiper stat VS number of distances
# With only taxa above ribbon
ggplot() +
geom_boxplot(data = f_val_dist, aes(x = log_n_dist, y = log_test_stat, group = log_n_dist), colour = "gray", outlier.shape = NA) +
geom_polygon(data = rib_data, aes(x = log_n_dist, y = y), alpha = 0.1) +
geom_point(data = df_intra_11 %>% filter(above), aes(x = log_n_dist_small, y = log_test_stat, fill = taxon, colour = taxon, shape = taxon)) +
geom_point(data = df_intra_11 %>% filter(!above), aes(x = log_n_dist_small, y = log_test_stat), colour = "gray") +
scale_fill_manual(values = df_intra_11_above$colour) +
scale_colour_manual(values = df_intra_11_above$colour) +
scale_shape_manual(values = df_intra_11_above$shape) +
scale_x_continuous(labels = label_math(expr = 10^.x, format = force), breaks = seq(2, 8, by = 2)) +
scale_y_continuous(labels = label_math(expr = 10^.x, format = force)) +
labs(x = "N distances", y = "Kuiper statistic", colour = "Taxon", fill = "Taxon", shape = "Taxon") Plot ECDFs
For all taxonomic groups.
df_intra_ecdf_11 %>%
mutate(random = ifelse(taxon %in% df_intra_11_above$taxon, "Non random", "Random")) %>%
ggplot() +
geom_path(aes(x = x_axis_seq, y = diff_sm, colour = random)) +
geom_hline(yintercept = 0, colour = "grey") +
labs(x = "Distance (cm)", y = "Plankton ECDF - Null ECDF") +
facet_wrap(~taxon, ncol = 3, scales = "free") + #strip.position
theme(
axis.text.y = element_blank(),
strip.text.y.left = element_text(angle = 0)
)And only for taxonomic groups which have non random distribution of distances.
df_intra_ecdf_11 %>%
filter(taxon %in% df_intra_11_above$taxon) %>%
ggplot() +
geom_path(aes(x = x_axis_seq, y = diff_sm)) +
geom_hline(yintercept = 0, colour = "grey") +
labs(x = "Distance (cm)", y = "Plankton ECDF - Null ECDF") +
facet_wrap(~taxon, ncol = 3, scales = "free") + #strip.position
theme(
axis.text.y = element_blank(),
strip.text.y.left = element_text(angle = 0)
)Let’s try to define groups based of these ECDFs:
Ctenophora + Rhizaria
All others